In [1]:
pip install pandas
Requirement already satisfied: pandas in d:\anaconda\lib\site-packages (2.0.3)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\anaconda\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda\lib\site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in d:\anaconda\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: numpy>=1.21.0 in d:\anaconda\lib\site-packages (from pandas) (1.24.3)
Requirement already satisfied: six>=1.5 in d:\anaconda\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [3]:
pip install numpy
Requirement already satisfied: numpy in d:\anaconda\lib\site-packages (1.24.3)
Note: you may need to restart the kernel to use updated packages.
In [4]:
pip install plotly
Requirement already satisfied: plotly in d:\anaconda\lib\site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in d:\anaconda\lib\site-packages (from plotly) (8.2.2)
Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install sklearn
Collecting sklearnNote: you may need to restart the kernel to use updated packages.
  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

Encountered error while generating package metadata.

See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
In [13]:
df = pd.read_csv('D:/MY DESTINATION/PROJECTS/Projects/Cancer_breast_survive_prediction/BRCA.csv')
In [14]:
df.head()
Out[14]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 FEMALE 0.080353 0.42638 0.54715 0.273680 III Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 FEMALE -0.420320 0.57807 0.61447 -0.031505 II Mucinous Carcinoma Positive Positive Negative Lumpectomy 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 FEMALE 0.213980 1.31140 -0.32747 -0.234260 III Infiltrating Ductal Carcinoma Positive Positive Negative Other 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 FEMALE 0.345090 -0.21147 -0.19304 0.124270 II Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 FEMALE 0.221550 1.90680 0.52045 -0.311990 II Infiltrating Ductal Carcinoma Positive Positive Negative Other 06-May-17 27-Jun-19 Dead
In [15]:
df.isnull().sum()
Out[15]:
Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64
In [17]:
df.shape
Out[17]:
(341, 16)
In [18]:
df= df.dropna()
In [19]:
df.isnull().sum()
Out[19]:
Patient_ID            0
Age                   0
Gender                0
Protein1              0
Protein2              0
Protein3              0
Protein4              0
Tumour_Stage          0
Histology             0
ER status             0
PR status             0
HER2 status           0
Surgery_type          0
Date_of_Surgery       0
Date_of_Last_Visit    0
Patient_Status        0
dtype: int64
In [20]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float64(5), object(11)
memory usage: 42.1+ KB
In [21]:
#Gender column to see how many females and males are there

print(df.Gender.value_counts())
Gender
FEMALE    313
MALE        4
Name: count, dtype: int64
In [22]:
# stage of tumour of the patients
stage  =  df["Tumour_Stage"].value_counts()
print(stage)
Tumour_Stage
II     180
III     77
I       60
Name: count, dtype: int64
In [23]:
stage  =  df["Tumour_Stage"].value_counts()
transactions = stage.index
quantity = stage.values
figure = px.pie(df, values=quantity, names= transactions, hole=0.5, title="Tumour Stages of Patients" )
figure.show()
In [24]:
# Histology
histology = df["Histology"].value_counts()
transactions = histology.index
quantity = histology.values
figure = px.pie(df, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Histology of Patients")
figure.show()
In [25]:
#ER status, PR status, and HER2 status of the patients
# ER status
print(df["ER status"].value_counts())
# PR status
print(df["PR status"].value_counts())
# HER2 status
print(df["HER2 status"].value_counts())
ER status
Positive    317
Name: count, dtype: int64
PR status
Positive    317
Name: count, dtype: int64
HER2 status
Negative    288
Positive     29
Name: count, dtype: int64
In [26]:
#surgery_type

# Surgery_type
surgery = df["Surgery_type"].value_counts()
transactions = surgery.index
quantity = surgery.values
figure = px.pie(df, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Type of Surgery of Patients")
figure.show()
In [27]:
df.head()
Out[27]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 FEMALE 0.080353 0.42638 0.54715 0.273680 III Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 FEMALE -0.420320 0.57807 0.61447 -0.031505 II Mucinous Carcinoma Positive Positive Negative Lumpectomy 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 FEMALE 0.213980 1.31140 -0.32747 -0.234260 III Infiltrating Ductal Carcinoma Positive Positive Negative Other 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 FEMALE 0.345090 -0.21147 -0.19304 0.124270 II Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 FEMALE 0.221550 1.90680 0.52045 -0.311990 II Infiltrating Ductal Carcinoma Positive Positive Negative Other 06-May-17 27-Jun-19 Dead
In [28]:
df["Tumour_Stage"] = df["Tumour_Stage"].map({"I": 1, "II": 2, "III": 3})
In [29]:
df.head()
Out[29]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 FEMALE 0.080353 0.42638 0.54715 0.273680 3 Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 FEMALE -0.420320 0.57807 0.61447 -0.031505 2 Mucinous Carcinoma Positive Positive Negative Lumpectomy 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 FEMALE 0.213980 1.31140 -0.32747 -0.234260 3 Infiltrating Ductal Carcinoma Positive Positive Negative Other 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 FEMALE 0.345090 -0.21147 -0.19304 0.124270 2 Infiltrating Ductal Carcinoma Positive Positive Negative Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 FEMALE 0.221550 1.90680 0.52045 -0.311990 2 Infiltrating Ductal Carcinoma Positive Positive Negative Other 06-May-17 27-Jun-19 Dead
In [30]:
df["Histology"] = df["Histology"].map({"Infiltrating Ductal Carcinoma": 1, 
                                           "Infiltrating Lobular Carcinoma": 2, "Mucinous Carcinoma": 3})
df.head()
Out[30]:
Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage Histology ER status PR status HER2 status Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
0 TCGA-D8-A1XD 36.0 FEMALE 0.080353 0.42638 0.54715 0.273680 3 1 Positive Positive Negative Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
1 TCGA-EW-A1OX 43.0 FEMALE -0.420320 0.57807 0.61447 -0.031505 2 3 Positive Positive Negative Lumpectomy 26-Apr-17 09-Nov-18 Dead
2 TCGA-A8-A079 69.0 FEMALE 0.213980 1.31140 -0.32747 -0.234260 3 1 Positive Positive Negative Other 08-Sep-17 09-Jun-18 Alive
3 TCGA-D8-A1XR 56.0 FEMALE 0.345090 -0.21147 -0.19304 0.124270 2 1 Positive Positive Negative Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
4 TCGA-BH-A0BF 56.0 FEMALE 0.221550 1.90680 0.52045 -0.311990 2 1 Positive Positive Negative Other 06-May-17 27-Jun-19 Dead
In [31]:
df["ER status"] = df["ER status"].map({"Positive": 1})
df["PR status"] = df["PR status"].map({"Positive": 1})
In [32]:
df["HER2 status"] = df["HER2 status"].map({"Positive": 1, "Negative": 2})
df["Gender"] = df["Gender"].map({"MALE": 0, "FEMALE": 1})
df["Surgery_type"] = df["Surgery_type"].map({"Other": 1, "Modified Radical Mastectomy": 2, 
                                                 "Lumpectomy": 3, "Simple Mastectomy": 4})
print(df.head())
     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0       1  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0       1 -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0       1  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0       1  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0       1  0.221550   1.90680   0.52045 -0.311990   

   Tumour_Stage  Histology  ER status  PR status  HER2 status  Surgery_type  \
0             3          1          1          1            2             2   
1             2          3          1          1            2             3   
2             3          1          1          1            2             1   
3             2          1          1          1            2             2   
4             2          1          1          1            2             1   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0       15-Jan-17          19-Jun-17          Alive  
1       26-Apr-17          09-Nov-18           Dead  
2       08-Sep-17          09-Jun-18          Alive  
3       25-Jan-17          12-Jul-17          Alive  
4       06-May-17          27-Jun-19           Dead  
In [44]:
status  =  df["Patient_Status"].value_counts()
status
Out[44]:
Patient_Status
Alive    255
Dead      62
Name: count, dtype: int64
In [33]:
x = np.array(df[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 
                   'Tumour_Stage', 'Histology', 'ER status', 'PR status', 
                   'HER2 status', 'Surgery_type']])
y = np.array(df['Patient_Status'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)
In [34]:
model = SVC()
model.fit(x_train, y_train)
Out[34]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [35]:
model.score(x_test,y_test)
Out[35]:
0.8125
In [46]:
# Prediction
# features = [['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type']]
features = np.array([[74, 1, -0.080353, -0.420320,  0.61447 , -0.273680, 3, 3, 0, 0, 2, 1,]])
print(model.predict(features))
['Alive']
In [39]:
import tkinter as tk
from tkinter import ttk
import numpy as np
from sklearn.svm import SVC

def predict():
    features = [
        float(entries['Age'].get()),
        int(entries['Gender'].get()),
        float(entries['Protein1'].get()),
        float(entries['Protein2'].get()),
        float(entries['Protein3'].get()),
        float(entries['Protein4'].get()),
        int(entries['Tumour Stage'].get()),
        int(entries['Histology'].get()),
        int(entries['ER status'].get()),
        int(entries['PR status'].get()),
        int(entries['HER2 status'].get()),
        int(entries['Surgery Type'].get())
    ]

    features = [features]  # Convert to a 2D array

    print("Input Features:", features)

    prediction = model.predict(features)
    print("Predicted Status:", prediction)

    result_label.config(text=f"Predicted Status: {prediction[0]}")



# Create the main window
window = tk.Tk()
window.title("Breast Cancer Survival Prediction")

# Create labels and entry widgets for each feature
labels = ['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3', 'Protein4',
          'Tumour Stage', 'Histology', 'ER status', 'PR status',
          'HER2 status', 'Surgery Type']

entries = {}

for i, label in enumerate(labels):
    ttk.Label(window, text=label).grid(row=i, column=0, padx=10, pady=5)
    entries[label] = ttk.Entry(window, textvariable=tk.StringVar(), width=20)
    entries[label].grid(row=i, column=1)

# Create a button to trigger prediction
predict_button = ttk.Button(window, text="Predict", command=predict)
predict_button.grid(row=len(labels) + 1, column=0, columnspan=2, pady=10)

# Create a label to display the prediction result
result_label = ttk.Label(window, text="")
result_label.grid(row=len(labels) + 2, column=0, columnspan=2, pady=5)

# Run the main loop
window.mainloop()
Input Features: [[43.0, 1, -0.42032, 0.57807, 0.61447, -0.031505, 2, 3, 1, 1, 2, 3]]
Predicted Status: ['Alive']
Input Features: [[43.0, 1, -0.42032, 0.57807, 0.61447, -0.031505, 2, 3, 1, 1, 2, 3]]
Predicted Status: ['Alive']
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: